This R notebook is a rapid analysi of the cohort data produced as part of the data synthesis of citizen science projects
Lets load in the data, it is currently seperate sheets in an excel file. Lets pull out the seperate sheets and assign to data.frames
summary <- read_excel(path = 'data/BES citsci data summary v3.xlsx', sheet = 1)
metadata <- read_excel(path = 'data/BES citsci data summary v3.xlsx', sheet = 2)
retention_tasks <- read_excel(path = 'data/BES citsci data summary v3.xlsx', sheet = 3)
retention_time <- read_excel(path = 'data/BES citsci data summary v3.xlsx', sheet = 4)
ppt_inequality <- read_excel(path = 'data/BES citsci data summary v3.xlsx', sheet = 5)
I think I need to spread the cohort data so that time is represented in columns
ret <- pivot_wider(data = retention_time, names_from = 'Session', values_from = 'NumberOfPeople')
head(ret)
## # A tibble: 6 x 28
## ProjectCode Cohort `1` `2` `3` `4` `5` `6` `7` `8` `9` `10`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 CNC_SF 1 779 211 182 153 NA NA NA NA NA NA
## 2 CNC_SF 2 NA 1536 385 299 NA NA NA NA NA NA
## 3 CNC_SF 3 NA NA 1697 450 NA NA NA NA NA NA
## 4 CNC_SF 4 NA NA NA 2076 NA NA NA NA NA NA
## 5 CNC_LA 1 891 156 158 126 NA NA NA NA NA NA
## 6 CNC_LA 2 NA 732 215 161 NA NA NA NA NA NA
## # ... with 16 more variables: `11` <dbl>, `12` <dbl>, `13` <dbl>, `14` <dbl>,
## # `15` <dbl>, `16` <dbl>, `17` <dbl>, `18` <dbl>, `19` <dbl>, `20` <dbl>,
## # `21` <dbl>, `22` <dbl>, `23` <dbl>, `24` <dbl>, `25` <dbl>, `26` <dbl>
# easy to view but not he format for ggplot
retention_time$CohortSession <- (retention_time$Session + 1) - retention_time$Cohort
retention_time$ProjectCohort <- paste(retention_time$ProjectCode,
retention_time$Cohort,
sep = '_')
retention_time$ProjectName <- NA
retention_time$ProjectName <- summary$ProjectName[match(retention_time$ProjectCode, summary$ProjectCode)]
retention_time$ProjectName[is.na(retention_time$ProjectName)] <-
as.character(retention_time$ProjectCode[is.na(retention_time$ProjectName)])
That works. So now can we do an initial visualisation of the data
p <- ggplot(retention_time, aes(x = CohortSession, y = NumberOfPeople, group = ProjectCohort)) +
geom_line(aes(colour = ProjectName)) +
theme(legend.position = "none")
ggplotly(p, tooltip = c('group'))
## Warning: `group_by_()` is deprecated as of dplyr 0.7.0.
## Please use `group_by()` instead.
## See vignette('programming') for more help
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_warnings()` to see where this warning was generated.
Okay now we need to average across cohorts in the same project and rescale so they all start with the same value
# Rescale
rescale <- function(x){
(x/max(x)) * 100
}
retention_time$NumberOfPeopleRescaled <- NA
for(i in unique(retention_time$ProjectCohort)){
retention_time$NumberOfPeopleRescaled[retention_time$ProjectCohort == i] <-
rescale(retention_time$NumberOfPeople[retention_time$ProjectCohort == i])
}
p <- ggplot(retention_time, aes(x = CohortSession, y = NumberOfPeopleRescaled, group = ProjectCohort)) +
geom_line(aes(colour = ProjectName)) +
theme(legend.position = "none")
ggplotly(p, tooltip = c('group'))
And now average across projects
# group and average
av_ret <- tapply(retention_time$NumberOfPeopleRescaled,
INDEX = list(retention_time$ProjectCode,
retention_time$CohortSession),
FUN = mean)
# Put into long format
av_ret <- melt(av_ret, varnames = c('ProjectCode', 'CohortSession'), value.name = 'NumberOfPeopleRescaled')
av_ret$ProjectName <- summary$ProjectName[match(av_ret$ProjectCode, summary$ProjectCode)]
av_ret$ProjectName[is.na(av_ret$ProjectName)] <-
as.character(av_ret$ProjectCode[is.na(av_ret$ProjectName)])
p <- ggplot(av_ret, aes(x = CohortSession, y = NumberOfPeopleRescaled,
group = ProjectName)) +
geom_line(aes(colour = ProjectName)) +
geom_point(aes(colour = ProjectName)) +
scale_y_continuous(trans='log10') +
theme(legend.position = "none")
ggplotly(p, tooltip = c('group'))
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis
After the first year these actually look very similar
retention_time$NumberOfPeopleRescaled <- NA
retention_time <- retention_time[retention_time$CohortSession != 1,]
for(i in unique(retention_time$ProjectCohort)){
retention_time$NumberOfPeopleRescaled[retention_time$ProjectCohort == i] <-
rescale(retention_time$NumberOfPeople[retention_time$ProjectCohort == i])
}
# group and average
av_ret2 <- tapply(retention_time$NumberOfPeopleRescaled,
INDEX = list(retention_time$ProjectCode,
retention_time$CohortSession),
FUN = mean)
# Put into long format
av_ret2 <- melt(av_ret2, varnames = c('ProjectCode', 'CohortSession'), value.name = 'NumberOfPeopleRescaled')
av_ret2$ProjectName <-
summary$ProjectName[match(av_ret2$ProjectCode,
summary$ProjectCode)]
av_ret2$ProjectName[is.na(av_ret2$ProjectName)] <-
as.character(av_ret2$ProjectCode[is.na(av_ret2$ProjectName)])
p <- ggplot(av_ret2, aes(x = CohortSession, y = NumberOfPeopleRescaled,
group = ProjectName)) +
geom_line(aes(colour = ProjectName)) +
geom_point(aes(colour = ProjectName)) +
scale_y_continuous(trans = 'log10') +
theme(legend.position = "none")
ggplotly(p, tooltip = c('group'))
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Transformation introduced infinite values in continuous y-axis